* Uttarakhand village-matching script

*****************************************************

* Load clean census data 
use "${data}/uttarakhand_census_2001.dta", clear

*****************************************************

* Eliminate districts and villages 

// Keep districts 11 (Nainital) and 8 (Bageshwar) only
drop if dist_code ~= 11 & dist_code ~= 8 

* Code up intervention villages
destring v_ct_code, replace
merge 1:1 v_ct_code using "${data}uttarakhand_surveyed_villages.dta"
replace teri = 0 if teri == .
replace chirag = 0 if chirag == .

// Add in the codes for the 3 mis-classified villages (previously linked to Almora)
replace chirag = 1 if v_ct_code == 01127300 | v_ct_code == 01131700 | v_ct_code == 01070800

// Drop TERI-only village 
drop if teri == 1 & chirag == 0

// Restrict to villages with non-zero/non-missing census populations + flag "small" villages
drop if t_p == 0 | t_p == .
gen small = (t_hh < 40)

*****************************************************

* Generate block dummies

tab block_code if chirag == 1 | teri == 1 // only blocks 67 68 73 74 75 76 78 and 79 have intervention villages

destring block_code, replace
for any 67 68 73 74 75 76 78 79: gen blockX = 1 if block_code == X
for any 67 68 73 74 75 76 78 79: replace blockX = 0 if blockX == .

*****************************************************

* Generate other variables

gen areasq = area^2
gen sc_pct = sc_p/t_p
gen st_pct = st_p/t_p
gen density = t_p/area

*****************************************************

* Models to generate propensity scores

estimates clear

# delimit ;
fsum chirag area areasq t_p sc_pct st_pct density p_sch m_sch s_sch medi_fac h_cntr ph_cntr 
tap phone bs_fac crsoc_fac app_pr dist_town power_dom land_fores block67 block68 block73 block74 block75 
block76 block78 block79 ;
* NB: Tap and power_dom have lots of missing observations - exclude from logits

*****************************************************

* Model 1: Large models with block controls
pwcorr chirag area areasq t_p sc_pct st_pct density p_sch m_sch s_sch medi_fac h_cntr ph_cntr 
tap phone bs_fac crsoc_fac app_pr dist_town power_dom land_fores block67 block68 block73 block74 block75
block76 block78 block79, sig ;

logit chirag area areasq t_p sc_pct st_pct density p_sch m_sch s_sch medi_fac h_cntr ph_cntr 
phone bs_fac crsoc_fac app_pr dist_town land_fores block67 block68 block73 block74 block75 
block76 block78 block79 ;
estimates store model_1 ;

predict chirag_hat1 ;

preserve ;

	for any 77 80 81 82 8888: drop if block_code == X ; // Drop non-intervention blocks prior to match
	drop if small == 1 ;

	psmatch2 chirag, pscore(chirag_hat1) trim(10) ;

	pstest area areasq t_p sc_pct st_pct density p_sch m_sch s_sch medi_fac h_cntr ph_cntr 
	phone bs_fac crsoc_fac app_pr dist_town land_fores block67 block68 block73 block74 block75 
	block76 block78 block79, both hist ;
	graph save "${output}chirag_hat1_nearblocks.gph", replace ;

	# delimit cr

	sum  _pscore _pdif, detail 
	sort _id 
	gen m1id = _id 
	gen m1ctrl = _n1 
	gen m1ctrl_code = v_ct_code[m1ctrl] 
	gen m1dist = _pdif

	# delimit ;

	for any area areasq t_p sc_pct st_pct density p_sch m_sch s_sch medi_fac h_cntr ph_cntr 
	phone bs_fac crsoc_fac app_pr dist_town land_fores block_code: gen m1ctrl_X = X[m1ctrl] ;

	# delimit cr

	// Match quality diagnostics

	tab _support _treated

	# delimit ;

	outsheet v_ct_code chirag teri m1id m1ctrl m1ctrl_code m1dist area m1ctrl_area areasq m1ctrl_areasq t_p 
	m1ctrl_t_p sc_pct m1ctrl_sc_pct st_pct m1ctrl_st_pct density m1ctrl_density p_sch m1ctrl_p_sch 
	m_sch m1ctrl_m_sch s_sch m1ctrl_s_sch medi_fac m1ctrl_medi_fac h_cntr m1ctrl_h_cntr ph_cntr 
	m1ctrl_ph_cntr phone m1ctrl_phone bs_fac m1ctrl_bs_fac crsoc_fac m1ctrl_crsoc_fac app_pr 
	m1ctrl_app_pr dist_town m1ctrl_dist_town land_fores m1ctrl_land_fores block_code m1ctrl_block_code
	using "${output}ut_match1_nearblocks.csv" if m1ctrl ~=. & _support==1, comma replace  ;

	# delimit cr

	drop _pscore-_pdif
	gr drop _all

restore

*****************************************************

* Model 2: Basic model without block controls

# delimit ;

pwcorr chirag area t_p sc_pct st_pct p_sch m_sch h_cntr ph_cntr 
phone bs_fac crsoc_fac app_pr dist_town land_fores , sig ;

logit chirag area t_p sc_pct st_pct p_sch m_sch h_cntr ph_cntr 
phone bs_fac crsoc_fac app_pr dist_town land_fores ;
estimates store model_2 ;

predict chirag_hat2 ;

preserve ;

	for any 77 80 81 82 8888: drop if block_code == X ; // Drop non-intervention blocks prior to match
	drop if small == 1 ;

	psmatch2 chirag, pscore(chirag_hat2) trim(10) ;

	pstest area t_p sc_pct st_pct p_sch m_sch h_cntr ph_cntr 
	phone bs_fac crsoc_fac app_pr dist_town land_fores, both hist ;
	graph save "${output}chirag_hat2_nearblocks.gph", replace ;

	# delimit cr

	sum  _pscore _pdif, detail 
	sort _id 
	gen m2id = _id 
	gen m2ctrl = _n1 
	gen m2ctrl_code = v_ct_code[m2ctrl] 
	gen m2dist = _pdif

	# delimit ;

	for any area t_p sc_pct st_pct p_sch m_sch h_cntr ph_cntr 
	phone bs_fac crsoc_fac app_pr dist_town land_fores block_code: gen m2ctrl_X = X[m2ctrl] ;

	# delimit cr

	// Match quality diagnostics

	tab _support _treated

	# delimit ;

	outsheet v_ct_code chirag teri m2id m2ctrl m2ctrl_code m2dist area m2ctrl_area t_p 
	m2ctrl_t_p sc_pct m2ctrl_sc_pct st_pct m2ctrl_st_pct p_sch m2ctrl_p_sch 
	m_sch m2ctrl_m_sch h_cntr m2ctrl_h_cntr ph_cntr m2ctrl_ph_cntr phone m2ctrl_phone 
	bs_fac m2ctrl_bs_fac crsoc_fac m2ctrl_crsoc_fac app_pr m2ctrl_app_pr dist_town 
	m2ctrl_dist_town land_fores m2ctrl_land_fores block_code m2ctrl_block_code
	using "${output}ut_match2_nearblocks.csv" if m2ctrl ~=. & _support==1, comma replace  ;

	# delimit cr

	drop _pscore-_pdif
	gr drop _all

restore

*****************************************************

* Model 3: Basic model with tap and power_dom, without block controls

# delimit ;

logit chirag area t_p sc_pct st_pct p_sch m_sch h_cntr ph_cntr tap
phone bs_fac crsoc_fac power_all app_pr dist_town land_fores ;
estimates store model_3 ;

predict chirag_hat3 ;

preserve ;

	for any 77 80 81 82 8888: drop if block_code == X ; // Drop non-intervention blocks prior to match
	drop if small == 1 ;

	psmatch2 chirag, pscore(chirag_hat3) trim(10) ;

	pstest area t_p sc_pct st_pct p_sch m_sch h_cntr ph_cntr tap
	phone bs_fac crsoc_fac power_all app_pr dist_town land_fores, both hist ;
	graph save "${output}chirag_hat3_nearblocks.gph", replace ;

	# delimit cr

	sum  _pscore _pdif, detail 
	sort _id 
	gen m3id = _id 
	gen m3ctrl = _n1 
	gen m3ctrl_code = v_ct_code[m3ctrl] 
	gen m3dist = _pdif

	# delimit ;

	for any area t_p sc_pct st_pct p_sch m_sch h_cntr ph_cntr tap
	phone bs_fac crsoc_fac power_all app_pr dist_town land_fores block_code: gen m3ctrl_X = X[m3ctrl] ;

	* Temporary file for graph ;
	tempfile psm_graph_data ;
	save `psm_graph_data' ;

	# delimit cr

	* Match quality diagnostics *

	tab _support _treated

	# delimit ;

	outsheet v_ct_code chirag teri m3id m3ctrl m3ctrl_code m3dist area m3ctrl_area t_p 
	m3ctrl_t_p sc_pct m3ctrl_sc_pct st_pct m3ctrl_st_pct p_sch m3ctrl_p_sch 
	m_sch m3ctrl_m_sch h_cntr m3ctrl_h_cntr ph_cntr m3ctrl_ph_cntr tap m3ctrl_tap 
	phone m3ctrl_phone bs_fac m3ctrl_bs_fac crsoc_fac m3ctrl_crsoc_fac power_all
	m3ctrl_power_all app_pr m3ctrl_app_pr dist_town m3ctrl_dist_town land_fores 
	m3ctrl_land_fores block_code m3ctrl_block_code using "${output}ut_match3_nearblocks.csv" if m3ctrl ~=. 
	& _support ==1, comma replace  ;

	# delimit cr

	drop _pscore-_pdif
	gr drop _all

restore 

*****************************************************

* Generate logit results table

estout model_1 model_2 model_3 ///
	using "${results}/apptable_psm.tex", ///
	eqlabels(none) ///
	replace style(tex) ///
	cells("b(fmt(a2) star)" se(par fmt(a2))) ///
	drop(block*) ///
	stats(N r2_p, fmt(%15.0gc a2) labels("\(N\)" "Pseudo \(R^2\)")) ///
  	starlevels(* 0.10 ** 0.05 *** 0.01) ///
    collabels(none) mlabel(none) ///
	varlabels(area "Area (km\textsuperscript{2})" ///
			areasq "Area\textsuperscript{2}" ///
			t_p "Total population" ///
			sc_pct "Scheduled Caste population (proportion)" ///
			st_pct "Scheduled Tribe population (proportion)" ///
			density "Population density" ///
			p_sch "Number of primary schools" ///
			m_sch "Number of middle schools" ///
			s_sch "Number of secondary schools" ///
			medi_fac "\(\mathbbm{1}\left( \text{Medical facilities} \right)\)" ///
			h_cntr "Number of health centers" ///
			ph_cntr "Number of primary health centers" ///
			phone "Number of telephone connections" ///
			bs_fac "\(\mathbbm{1}\left( \text{Bus services} \right)\)" ///
			crsoc_fac "\(\mathbbm{1}\left( \text{Credit societies} \right)\)" ///
			app_pr "\(\mathbbm{1}\left( \text{Approach to village: paved road} \right)\)" ///
			dist_town "Distance from nearest town (km)" ///
			land_fores "Forest area (hectares)" ///
			tap "\(\mathbbm{1}\left( \text{Tap water} \right)\)" ///
			power_all "\(\mathbbm{1}\left( \text{Electricity for all purposes} \right)\)" ///
			_cons "Constant" ///
				, elist(areasq " \addlinespace" ///
						t_p " \addlinespace" ///
						sc_pct " \addlinespace" ///
						st_pct " \addlinespace" ///
						density " \addlinespace" ///
						p_sch " \addlinespace" ///
						m_sch " \addlinespace" ///
						s_sch " \addlinespace" ///
						medi_fac " \addlinespace" ///
						h_cntr " \addlinespace" ///
						ph_cntr " \addlinespace" ///
						phone " \addlinespace" ///
						bs_fac " \addlinespace" ///
						crsoc_fac " \addlinespace" ///
						app_pr " \addlinespace" ///
						dist_town " \addlinespace" ///
						land_fores " \addlinespace" ///
						tap " \addlinespace") ///
				) ///
	prehead(  ///
		"\begin{table}[h!]" ///
		"\centering" ///
		"\caption{Propensity-score estimation using logistic regression}" ///
		"\label{app:psmatch_table}" ///
		"\resizebox*{!}{\dimexpr0.9\textheight-2\baselineskip\relax}{%" ///
		"\begin{threeparttable}" ///
		"\begin{tabular}{l*{3}{c}}" ///
		"\toprule" ///
		"\multirow{2}{*}{Village-level characteristic} & (1) & (2) & (3) \\ \cmidrule(lr){2-4}" ///
		"& \multicolumn{3}{c}{\(\mathbbm{1}\left( \text{NGO village} \right)\)} \\" ///
		"\midrule" ///
		)  ///
	prefoot("\midrule" ///
		"Sub-district fixed effects & Yes & No & No \\" ///
		)  ///
	postfoot("\bottomrule" ///
		"\end{tabular}" ///
		"\begin{tablenotes}" ///
		"{\setlength\labelsep{0pt}" ///
		"\footnotesize" ///
		"\item \textit{Notes}. This table presents results from logistic regressions of an indicator for whether our partner NGO had operated in village \(i\) in the past---represented by \(\mathbbm{1}\left( \text{NGO village} \right)\)---on a set of village-level characteristics from the 2001 round of the Indian Census. Standard errors in parentheses. Our final model is shown in column (3). For this model, we initially restrict our sample to all Census-designated villages in the Bageshwar and Nainital districts of the state of Uttarakhand with non-zero or non-missing values for total population. We then exclude six villages where pretesting activities occurred with an alternative NGO partner (details available upon request). The final estimation sample for the model presented in column (3), thus, consists of all remaining villages with non-missing values for the village-level characteristics used for estimation. \sym{*} \(p<0.10\), \sym{**} \(p<0.05\), \sym{***} \(p<0.01\).}" ///
		"\end{tablenotes}" ///
		"\end{threeparttable}" ///
		"}" ///
		"\end{table}" ///
		)

*****************************************************

* Generate PSM match figure

use `psm_graph_data', clear

// compare _pscores before matching & save graph to disk
twoway (kdensity _pscore if _treated==1, bwidth(0.0175)) ///
	(kdensity _pscore if _treated==0, ///
		lpattern(dash) bwidth(0.0175)), legend( label(1 "NGO") label( 2 "Non-NGO" ) col(1)) ///
		xtitle("") ytitle("") title("(a) Pre-match") name(before, replace) nodraw

// compare _pscores *after* matching & save graph to disk
gen match=_n1
replace match=_id if match==.
duplicates tag match, gen(dup)
twoway (kdensity _pscore if _treated==1, bwidth(0.0175)) ///
	(kdensity _pscore if _treated==0 ///
		& dup>0, lpattern(dash) bwidth(0.0175)), legend( label( 1 "NGO") label( 2 "Non-NGO" )) ///
		xtitle("") ytitle("") title("(b) Post-match") name(after, replace) nodraw

// combine these two graphs that were saved to disk
// put both graphs on y axes with common scales
grc1leg before after, ycommon xcommon ///
	legendfrom(before) pos(3) cols(1) ///
	b1("Propensity score") l1("Density") name(combined_psm, replace)

gr disp combined_psm, ysize(12) xsize(10)

graph export "${results}figure_psmatch_distribution.png", width(4000) replace
gr drop _all

* Count of sample being presented in each graph
tab _treated
sum _pscore if _treated == 0 & dup > 0

